In [269]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.base import TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score



In [270]:

    
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )

Holdout

Since the dataset for this project is so small, a hold-out set will not be used, and only k-fold testing and training splits will be used to measure accuracy.

This is because even with a stratified hold-out set of 20%, with only 146 data points, lots of missing data and and 18 poi's, there would be only 3 or so points to do a final test on. This does not give much confidence in the precision of the performance metrics on such a small hold-out set, while also negatively impacting the ability to create the model.

"when the number of samples is not large, a strong case can be made that a test set should be avoided because every sample may be needed for model building. (...) Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgements. "

[1] Kuhn M., Kjell J.(2013). Applied Predictive Modeling. Springer. pp.67

Hawkins et al. (2003) concisely summarize this point:“holdout samples of tolerable size [. . . ] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate.”

[2] Hawkins D, Basak S, Mills D (2003). “Assessing Model Fit by Cross– Validation.” Journal of Chemical Information and Computer Sciences, 43(2), 579–586

This will be addressed with K-fold cross-validation resampling techniques.

Version 2 - Cross Validation Scheme

Define sets of model parameters values to evaluate
for each parameter set in grid search DO
1. For each k-fold resampling iteration DO
  1. Hold-out 1/k samples/fold
  2. Pre-Process Data (Create functions on training set, apply to test set with same)
    1. Impute data (median)
    2. Scale features (x_i - mean))/std
    3. Perform any univariate feature selection (remove very low variation features)
    4. Modeling feature selection (ExtraTreesClassifier)
  3. Fit the model on the k/K training fold
  4. Predict the hold-out samples/fold
2. END
3. Calculate the average performance across hold-out predictions
END
Determine the optimal parameter set
Fit the final model to all training data using the optimal parameter set

Fix 2 out-of-sync records



In [271]:

    
data_dict['BELFER ROBERT'] = {'bonus': 'NaN',
                              'deferral_payments': 'NaN',
                              'deferred_income': -102500,
                              'director_fees': 102500,
                              'email_address': 'NaN',
                              'exercised_stock_options': 'NaN',
                              'expenses': 3285,
                              'from_messages': 'NaN',
                              'from_poi_to_this_person': 'NaN',
                              'from_this_person_to_poi': 'NaN',
                              'loan_advances': 'NaN',
                              'long_term_incentive': 'NaN',
                              'other': 'NaN',
                              'poi': False,
                              'restricted_stock': -44093,
                              'restricted_stock_deferred': 44093,
                              'salary': 'NaN',
                              'shared_receipt_with_poi': 'NaN',
                              'to_messages': 'NaN',
                              'total_payments': 3285,
                              'total_stock_value': 'NaN'}

data_dict['BHATNAGAR SANJAY'] = {'bonus': 'NaN',
                                 'deferral_payments': 'NaN',
                                 'deferred_income': 'NaN',
                                 'director_fees': 'NaN',
                                 'email_address': 'sanjay.bhatnagar@enron.com',
                                 'exercised_stock_options': 15456290,
                                 'expenses': 137864,
                                 'from_messages': 29,
                                 'from_poi_to_this_person': 0,
                                 'from_this_person_to_poi': 1,
                                 'loan_advances': 'NaN',
                                 'long_term_incentive': 'NaN',
                                 'other': 'NaN',
                                 'poi': False,
                                 'restricted_stock': 2604490,
                                 'restricted_stock_deferred': -2604490,
                                 'salary': 'NaN',
                                 'shared_receipt_with_poi': 463,
                                 'to_messages': 523,
                                 'total_payments': 137864,
                                 'total_stock_value': 15456290}



In [484]:

    
df = pd.DataFrame.from_dict(data_dict, orient='index')
df = df.drop('TOTAL', axis=0)









    Out[484]:





salary                       object
to_messages                  object
deferral_payments            object
total_payments               object
exercised_stock_options      object
bonus                        object
restricted_stock             object
shared_receipt_with_poi      object
restricted_stock_deferred    object
total_stock_value            object
expenses                     object
loan_advances                object
from_messages                object
other                        object
from_this_person_to_poi      object
poi                            bool
director_fees                object
deferred_income              object
long_term_incentive          object
email_address                object
from_poi_to_this_person      object
dtype: object

'NaN' was imported as a string instead of a missing value. We will convert these to NaN type and look how many missing values our data has.



In [485]:

    
# Replace 'NaN' strings with 0's
df = df.replace('NaN', 0)
# Replace email strings with True/False boolean as to whether an email was present or not
# df['email_address'] = df['email_address'].fillna(0).apply(lambda x: x != 0, 1)
# Remove 'email_address' string as a feature
del df['email_address']









    Out[485]:





salary                       int64
to_messages                  int64
deferral_payments            int64
total_payments               int64
exercised_stock_options      int64
bonus                        int64
restricted_stock             int64
shared_receipt_with_poi      int64
restricted_stock_deferred    int64
total_stock_value            int64
expenses                     int64
loan_advances                int64
from_messages                int64
other                        int64
from_this_person_to_poi      int64
poi                           bool
director_fees                int64
deferred_income              int64
long_term_incentive          int64
from_poi_to_this_person      int64
dtype: object



In [464]:



In [427]:

    
df_original = df.copy()



In [884]:

    
# Convert features to floats since MinMaxScaler does not like int64's
X_original = df.drop(['poi'], axis=1).astype(float)
y_original = df['poi']

# Drop any row that has only zeros in it, drop from labels first, then from features
y_original = y_original[X_original.abs().sum(axis=1) != 0]
X_original = X_original[X_original.abs().sum(axis=1) != 0]

# Save the names of the features 
X_names = X_original.columns
#X_original = X_original.apply(lambda x: x.fillna(0), axis=0)

# Scale the features
standardized = MinMaxScaler().fit_transform(X_original)

# Score the features using a classification scoring function using 
# the Anova F-value for the provided sample
selection = SelectKBest(k='all', score_func=f_classif).fit(standardized, y_original)

#new_X = selection.transform(standardized)

#KBestNames = X_names[selection.get_support()]

# Create a pd.DataFrame of the names and scores
scores = pd.DataFrame([X_names, selection.scores_])
scores = scores.T
scores.columns = ['Features', 'Scores']
scores = scores.sort(['Scores'], ascending=False).reset_index(drop=True)
scores









    Out[884]:






  
    
      
      Features
      Scores
    
  
  
    
      0 
                         exercised_stock_options_squared
          25.04327
    
    
      1 
                                       total_stock_value
          22.78211
    
    
      2 
                                 exercised_stock_options
          22.61053
    
    
      3 
                                                   bonus
             21.06
    
    
      4 
                                   bonus_total_pay_ratio
          20.98877
    
    
      5 
                                                  salary
           18.5757
    
    
      6 
                                      total_compensation
          17.18271
    
    
      7 
                     long_term_incentive_total_pay_ratio
          14.01403
    
    
      8 
                                          salary_squared
          13.75712
    
    
      9 
                                         deferred_income
          11.56189
    
    
      10
                                           bonus_squared
          10.69425
    
    
      11
                                     long_term_incentive
          10.07245
    
    
      12
                                 deferred_income_squared
          10.01498
    
    
      13
                                          total_payments
          9.380237
    
    
      14
                                        restricted_stock
          8.964964
    
    
      15
                                   total_poi_interaction
          8.773847
    
    
      16
                                 shared_receipt_with_poi
          8.746486
    
    
      17
                                   loan_advances_squared
          7.307514
    
    
      18
                                           loan_advances
           7.24273
    
    
      19
                          bonus_total_compensation_ratio
          6.731432
    
    
      20
                          shared_poi_from_messages_ratio
          5.793909
    
    
      21
                                                expenses
          5.550684
    
    
      22
                           loan_advances_total_pay_ratio
          5.396396
    
    
      23
                                 from_poi_to_this_person
          5.344942
    
    
      24
                            from_messages_from_poi_ratio
           5.20965
    
    
      25
                         shared_receipt_with_poi_squared
          4.979852
    
    
      26
                            total_active_poi_interaction
          4.955198
    
    
      27
                                    bonus_by_total_stock
          4.920968
    
    
      28
                                           other_squared
          4.828893
    
    
      29
                                restricted_stock_squared
          4.794726
    
    
      ...
      ...
      ...
    
    
      41
                                   director_fees_squared
          1.913096
    
    
      42
                       deferral_payments_total_pay_ratio
          1.779904
    
    
      43
                                             to_messages
          1.698824
    
    
      44
             from_poi_to_this_person_total_poi_int_ratio
          1.429218
    
    
      45
             from_this_person_to_poi_total_poi_int_ratio
          1.245077
    
    
      46
                       expenses_total_compensation_ratio
          1.196422
    
    
      47
                      restricted_stock_total_stock_ratio
          1.128587
    
    
      48
                           to_poi_total_active_poi_ratio
          1.115302
    
    
      49
              deferral_payments_total_compensation_ratio
          1.085361
    
    
      50
                                        expenses_squared
          0.776179
    
    
      51
                               restricted_stock_deferred
         0.7434934
    
    
      52
                                   other_total_pay_ratio
         0.7191198
    
    
      53
                         from_poi_to_this_person_squared
         0.6159672
    
    
      54
                       restricted_stock_deferred_squared
          0.311332
    
    
      55
                           director_fees_total_pay_ratio
         0.2215513
    
    
      56
                                       deferral_payments
         0.2212145
    
    
      57
                  director_fees_total_compensation_ratio
         0.2199087
    
    
      58
                               deferral_payments_squared
         0.1872097
    
    
      59
                                           from_messages
         0.1641645
    
    
      60
       restricted_stock_deferred_total_compensation_r...
             0.142
    
    
      61
                                expenses_total_pay_ratio
         0.1220853
    
    
      62
               exercised_stock_options_total_stock_ratio
        0.06276039
    
    
      63
               restricted_stock_total_compensation_ratio
        0.04006004
    
    
      64
        exercised_stock_options_total_compensation_ratio
        0.03827046
    
    
      65
                          other_total_compensation_ratio
        0.02193787
    
    
      66
                           shared_poi_total_compensation
       0.006134506
    
    
      67
                         salary_total_compensation_ratio
       0.002550889
    
    
      68
                         deferred_income_total_pay_ratio
               NaN
    
    
      69
             restricted_stock_deferred_total_stock_ratio
               NaN
    
    
      70
                deferred_income_total_compensation_ratio
               NaN
    
  

71 rows × 2 columns



In [885]:

    
topKBest = list(scores.Features[0:17])
topKBest









    Out[885]:





['exercised_stock_options_squared',
 'total_stock_value',
 'exercised_stock_options',
 'bonus',
 'bonus_total_pay_ratio',
 'salary',
 'total_compensation',
 'long_term_incentive_total_pay_ratio',
 'salary_squared',
 'deferred_income',
 'bonus_squared',
 'long_term_incentive',
 'deferred_income_squared',
 'total_payments',
 'restricted_stock',
 'total_poi_interaction',
 'shared_receipt_with_poi']



In [867]:

    
ET_selection = ExtraTreesClassifier(n_estimators=1000).fit(standardized, y_original)
#print ET_selection.feature_importances_

ET_new_X = selection.transform(standardized)

# Create a pd.DataFrame of the names and importances
scores = pd.DataFrame(ET_selection.feature_importances_, index=X_names)
#scores = scores.T

scores.columns = ['Importance']
scores = scores.sort(['Importance'], ascending=False)
print "TOP10: \n", list(scores.index[0:9])
print scores
scores.sort(['Importance'], ascending=True).plot(kind='barh')









    



TOP10: 
['exercised_stock_options_squared', 'exercised_stock_options', 'total_stock_value', 'bonus_total_pay_ratio', 'long_term_incentive_total_pay_ratio', 'bonus', 'total_compensation', 'bonus_squared', 'deferred_income']
                                                    Importance
exercised_stock_options_squared                       0.043068
exercised_stock_options                               0.037059
total_stock_value                                     0.033604
bonus_total_pay_ratio                                 0.029329
long_term_incentive_total_pay_ratio                   0.028237
bonus                                                 0.027221
total_compensation                                    0.024189
bonus_squared                                         0.023910
deferred_income                                       0.023385
deferred_income_squared                               0.023150
from_messages_from_poi_ratio                          0.022344
to_messages_to_poi_ratio                              0.020644
other_total_pay_ratio                                 0.020500
expenses                                              0.019571
bonus_by_total_stock                                  0.019283
salary_squared                                        0.019044
other                                                 0.018869
restricted_stock                                      0.018736
other_squared                                         0.018377
salary                                                0.018345
from_this_person_to_poi_squared                       0.018232
other_total_compensation_ratio                        0.018150
restricted_stock_total_stock_ratio                    0.017540
shared_poi_from_messages_ratio                        0.017433
expenses_squared                                      0.017022
restricted_stock_squared                              0.017019
exercised_stock_options_total_stock_ratio             0.016967
shared_receipt_with_poi                               0.016761
from_this_person_to_poi_total_poi_int_ratio           0.016709
bonus_total_compensation_ratio                        0.016191
...                                                        ...
from_poi_to_this_person                               0.013841
total_active_poi_interaction                          0.013830
expenses_total_compensation_ratio                     0.012826
to_poi_total_active_poi_ratio                         0.012313
from_poi_to_this_person_squared                       0.012243
long_term_incentive_total_compensation_ratio          0.012142
from_poi_total_active_poi_ratio                       0.011374
shared_receipt_with_poi_total_poi_int_ratio           0.010444
shared_poi_total_compensation                         0.009914
from_poi_to_this_person_total_poi_int_ratio           0.009603
to_messages                                           0.009144
from_messages                                         0.007447
deferral_payments_total_compensation_ratio            0.006714
deferral_payments                                     0.006280
deferral_payments_total_pay_ratio                     0.006070
deferral_payments_squared                             0.005598
loan_advances                                         0.002572
loan_advances_squared                                 0.002287
loan_advances_total_pay_ratio                         0.001858
loan_advances_total_compensation_ratio                0.001720
restricted_stock_deferred                             0.001679
restricted_stock_deferred_squared                     0.001584
director_fees                                         0.000214
director_fees_squared                                 0.000166
director_fees_total_pay_ratio                         0.000007
restricted_stock_deferred_total_stock_ratio           0.000000
deferred_income_total_pay_ratio                       0.000000
director_fees_total_compensation_ratio                0.000000
deferred_income_total_compensation_ratio              0.000000
restricted_stock_deferred_total_compensation_ratio    0.000000

[71 rows x 1 columns]






    Out[867]:





<matplotlib.axes._subplots.AxesSubplot at 0x31489ac8>



In [868]:









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-868-af103a5dbfcd> in <module>()
----> 1 topKBest = list(scores.Features[0:17])
      2 topKBest

c:\Anaconda\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
   1841                 return self[name]
   1842             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1843                                  (type(self).__name__, name))
   1844 
   1845     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'Features'



In [ ]:



In [503]:

    
for i in range(10): 
    sys.stdout.write('{0}..'.format(i)) 
    sys.stdout.flush() 
    time.sleep(.1)









    



0..1..2..3..4..5..6..7..8..9..



In [6]:

    
# Replace with index watcher
# A quick look at the original finanical spreadsheet shows TOTAL at the bottom 
# totaling all entries for everyone. This is obviously an outlier with no 
# meaningful information and can be removed.

# df[df['salary'] > 1000000]
# df[df.index == 'TOTAL']
df = df.drop('TOTAL', axis=0)



In [ ]:

By default, the GridSearchCV uses a 3-fold cross-validation. However, if it detects that a classifier is passed, rather than a regressor, it uses a stratified 3-fold.

http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html

Remove columns with less than 50% of entries present.

Remove rows with no non-NA values



In [7]:

    
# low_var_remover = VarianceThreshold(threshold=.5)



In [8]:

    
# ************************
# Encode as 0 instead.
# Remove columns with more than 50% NA's
# df_50 = df.dropna(axis=1, thresh=len(df)/2)
# ************************

# Since email_address and poi are True/False, every record should have at least 2 non-NA.
# We'll next remove any rows that don't have at least 2 non-NA values besides these.
# The criteria is: No more than 11 NA's per row.
# df_50 = df_50.dropna(axis=0, thresh=5)

# 128 records remain.
# df_50.info()

Financial NA's

When looking at the source of the data, the NA entries in the financial data seem values that are reported as zero since all payments/stock values add up to the total payments/stocks values. These NA values should then be set to 0 to add up to the totals reported by the accounting spreadsheet.

Email statistics NA's

The missing values for NA's for email statistics may be a little more subjective.

Some email statistics are features created with prior knowledge of the entire dataset (i.e. emails to/from poi's). This may be data snooping, since if new data/pois were somehow introduced, it would not be possible to generate these features without prior knowledge of which new data were the poi's.
NA's here imply that the person did not have an email account with Enron, or were not involved in emailing by some other way.

This means all email data features ar NA if even one column had missing email data for that person. It is hard to judge any distribution that they could have if they were given an email account since they have no ties to the financial data to infer distributions.

We have no real way to infer a person having sent/recieved 10 emails or 10,000 from completely unrelated financial data from a different dataset with many different people.

For this reason, these NA will also be encoded as 0.



In [9]:

    
df = df.apply(lambda x: x.fillna(0), axis=0)



In [ ]:

Imputation



In [570]:

    
import seaborn as sns
sns.set(style='darkgrid')

f, ax = plt.subplots(figsize=(14, 14))
cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.corrplot(df.corr(), annot=True, sig_stars=False,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [587]:

    
corrs = df.corr()
corrs.sort(['poi'], ascending=False)['poi']









    Out[587]:





poi                                                   1.000000
ex_stock_squared                                      0.387501
total_stock_value                                     0.372603
exercised_stock_options                               0.371336
bonus                                                 0.360262
bonus_total_pay_ratio                                 0.359778
salary                                                0.341365
total_compensation                                    0.329207
long_term_incentive_total_pay_ratio                   0.300332
long_term_incentive                                   0.258301
total_payments                                        0.249394
restricted_stock                                      0.244578
total_poi_interaction                                 0.242457
shared_receipt_with_poi                               0.242105
loan_advances                                         0.220405
bonus_total_compensation_ratio                        0.214250
shared_poi_from_messages_ratio                        0.199026
expenses                                              0.195503
from_poi_to_this_person                               0.191549
loan_advances_total_pay_ratio                         0.191546
from_messages_from_poi_ratio                          0.188969
total_active_poi_interaction                          0.184635
bonus_by_total_stock                                  0.183942
loan_advances_total_compensation_ratio                0.177439
from_poi_total_active_poi_ratio                       0.172996
other                                                 0.170443
to_messages_to_poi_ratio                              0.169767
salary_total_pay_ratio                                0.140007
shared_receipt_with_poi_total_poi_int_ratio           0.133709
from_this_person_to_poi                               0.130319
long_term_incentive_total_compensation_ratio          0.120949
to_messages                                           0.110006
from_poi_to_this_person_total_poi_int_ratio           0.101335
from_this_person_to_poi_total_poi_int_ratio           0.094211
restricted_stock_total_stock_ratio                    0.090765
to_poi_total_active_poi_ratio                         0.089748
other_total_pay_ratio                                 0.072171
restricted_stock_deferred                             0.071629
other_total_compensation_ratio                        0.013383
salary_total_compensation_ratio                      -0.002503
shared_poi_total_compensation                        -0.005917
exercised_stock_options_total_compensation_ratio     -0.013675
restricted_stock_total_compensation_ratio            -0.015168
exercised_stock_options_total_stock_ratio            -0.017560
expenses_total_pay_ratio                             -0.028187
restricted_stock_deferred_total_compensation_ratio   -0.031373
from_messages                                        -0.033302
deferral_payments                                    -0.038635
director_fees_total_compensation_ratio               -0.039031
director_fees_total_pay_ratio                        -0.039176
deferral_payments_total_compensation_ratio           -0.085954
expenses_total_compensation_ratio                    -0.090538
deferral_payments_total_pay_ratio                    -0.109995
director_fees                                        -0.120144
deferred_income                                      -0.274998
deferred_income_total_pay_ratio                            NaN
restricted_stock_deferred_total_stock_ratio                NaN
deferred_income_total_compensation_ratio                   NaN
Name: poi, Length: 58, dtype: float64



In [11]:

    
# Pick a column which we are predicting.
# Find other variables correlated to used KMeansNeighborsRegression to predict/impute
# the missing values.
# df_50.corr().ix[: ,'salary']



In [12]:

    
# cols1 = ['salary', 'other', 'total_stock_value', 'exercised_stock_options', 
#        'total_payments', 'restricted_stock']
# Bonus and salary values don't seem to be missing at random. Anytime there is a null value
# for salary, there is also one for bonus. So bonus can't be used to predict salary on
# the first pass. Predicted salary values will be used to predict bonus values though 
# on a second pass.
# cols2= ['salary', 'other', 'total_stock_value', 'exercised_stock_options', 
#        'total_payments', 'restricted_stock', 'bonus']
# cols3 = ['to_messages', 'from_this_person_to_poi', 'from_messages', 
# 'shared_receipt_with_poi', 'from_poi_to_this_person']



In [13]:

    
def kcluster_null(df=None, cols=None, process_all=True):
    '''
    Input: Takes pandas dataframe with values to impute, and a list of columns to impute
        and use for imputing.
    Returns: Pandas dataframe with null values imputed for list of columns passed in.
    
    # Ideally columns should be somewhat correlated since they will be used in KNN to
    # predict each other, one column at a time.
    
    '''
    
    # Create a KNN regression estimator for 
    income_imputer = KNeighborsRegressor(n_neighbors=1)
    # Loops through the columns passed in to impute each one sequentially.
    
    if not process_all:
        to_pred = cols[0]
        predictor_cols = cols[1:]
        
        
    for each in cols:
        # Create a temp list that does not include the column being predicted.
        temp_cols = [col for col in cols if col != each]
        # Create a dataframe that contains no missing values in the columns being predicted.
        # This will be used to train the KNN estimator.
        df_col = df[df[each].isnull()==False]
        
        # Create a dataframe with all of the nulls in the column being predicted.
        df_null_col = df[df[each].isnull()==True]
        
        # Create a temp dataframe filling in the medians for each column being used to
        # predict that is missing values.
        # This step is needed since we have so many missing values distributed through 
        # all of the columns.
        temp_df_medians = df_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        
        # Fit our KNN imputer to this dataframe now that we have values for every column.
        income_imputer.fit(temp_df_medians, df_col[each])
        
        # Fill the df (that has null values being predicted) with medians in the other
        # columns not being predicted.
        # ** This currently uses its own medians and should ideally use the predictor df's
        # ** median values to fill in NA's of columns being used to predict.
        temp_null_medians = df_null_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        
        # Predict the null values for the current 'each' variable.
        new_values = income_imputer.predict(temp_null_medians[temp_cols])

        # Replace the null values of the original null dataframe with the predicted values.
        df_null_col[each] = new_values
        
        # Append the new predicted nulls dataframe to the dataframe which containined
        # no null values.
        # Overwrite the original df with this one containing predicted columns. 
        # Index order will not be preserved since it is rearranging each time by 
        # null values.
        df = df_col.append(df_null_col)
        
    # Returned final dataframe sorted by the index names.
    return df.sort_index(axis=0)



In [ ]:



In [14]:

    
df.irow(127)









    Out[14]:





salary                            0
to_messages                       0
deferral_payments                 0
total_payments               362096
exercised_stock_options           0
bonus                             0
restricted_stock                  0
shared_receipt_with_poi           0
restricted_stock_deferred         0
total_stock_value                 0
expenses                          0
loan_advances                     0
from_messages                     0
other                        362096
from_this_person_to_poi           0
poi                           False
director_fees                     0
deferred_income                   0
long_term_incentive               0
email_address                 False
from_poi_to_this_person           0
Name: THE TRAVEL AGENCY IN THE PARK, dtype: object



In [15]:

    
#cols = [x for x in df.columns]
#for each in cols:
#    g = sns.FacetGrid(df, col='poi', margin_titles=True, size=6)
#    g.map(plt.hist, each, color='steelblue')



In [16]:

    
from pandas.tools.plotting import scatter_matrix



In [17]:

    
list(df.columns)









    Out[17]:





['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'email_address',
 'from_poi_to_this_person']



In [18]:

    
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive'])

email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])



In [19]:

    
from sklearn.ensemble import RandomForestClassifier



In [20]:

    
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[financial_cols], df['poi'])









    Out[20]:





RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)



In [21]:

    
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)



In [22]:

    
padding = np.arange(len(financial_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, financial_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [23]:

    
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[email_cols], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(email_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, email_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [24]:

    
all_cols = np.concatenate([email_cols, financial_cols])
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[all_cols], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(all_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, all_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [25]:

    
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)









    Out[25]:





0     118
1      10
2       6
3       4
8       2
6       2
14      1
13      1
4       1
dtype: int64



In [26]:

    
df.exercised_stock_options.plot()









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a277b70>



In [27]:

    
def capValues(x, cap):
    return (cap if x > cap else x)



In [28]:

    
df.exercised_stock_options = df.exercised_stock_options.apply(lambda x: capValues(x, 5000000))



In [29]:

    
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)









    Out[29]:





0     60
1     18
14    17
2     13
4     12
6      7
3      5
5      4
12     3
13     2
9      2
7      2
dtype: int64



In [30]:

    
df[['ex_stock_bins', 'poi']].groupby('ex_stock_bins').mean().plot()









    Out[30]:





<matplotlib.axes._subplots.AxesSubplot at 0x187cbda0>



In [31]:

    
df.columns









    Out[31]:





Index([u'salary', u'to_messages', u'deferral_payments', u'total_payments', u'exercised_stock_options', u'bonus', u'restricted_stock', u'shared_receipt_with_poi', u'restricted_stock_deferred', u'total_stock_value', u'expenses', u'loan_advances', u'from_messages', u'other', u'from_this_person_to_poi', u'poi', u'director_fees', u'deferred_income', u'long_term_incentive', u'email_address', u'from_poi_to_this_person', u'ex_stock_bins'], dtype='object')



In [32]:

    
df[['bonus', 'poi']].groupby('bonus').mean().plot()









    Out[32]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a622438>



In [33]:

    
df.shared_receipt_with_poi.plot()









    Out[33]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a5990f0>



In [34]:

    
max(df.shared_receipt_with_poi)









    Out[34]:





5521.0



In [35]:

    
# Create bins for shared receipt with poi
my_bins = [min(df.shared_receipt_with_poi)] + [250] + range(500, 5000, 500) + [max(df.shared_receipt_with_poi)]
df['shared_poi_bins'] = pd.cut(df.shared_receipt_with_poi, bins=my_bins, labels=False, include_lowest=True)
pd.value_counts(df['shared_poi_bins'])









    Out[35]:





0     81
2     19
5     11
3      9
1      9
4      6
8      4
6      4
10     2
dtype: int64



In [ ]:



In [36]:

    
df[['shared_poi_bins', 'poi']].groupby('shared_poi_bins').mean().plot()









    Out[36]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a80f400>



In [37]:

    
df.total_stock_value









    Out[37]:





ALLEN PHILLIP K          1729541
BADUM JAMES P             257817
BANNANTINE JAMES M       5243487
BAXTER JOHN C           10623258
BAY FRANKLIN R             63014
BAZELIDES PHILIP J       1599641
BECK SALLY W              126027
BELDEN TIMOTHY N         1110705
BELFER ROBERT                  0
BERBERIAN DAVID          2493616
BERGSIEKER RICHARD P      659249
BHATNAGAR SANJAY        15456290
BIBI PHILIPPE A          1843816
BLACHMAN JEREMY M         954354
BLAKE JR. NORMAN P             0
...
UMANOFF ADAM S                  0
URQUHART JOHN A                 0
WAKEHAM JOHN                    0
WALLS JR ROBERT H         5898997
WALTERS GARETH W          1030329
WASAFF GEORGE             2056427
WESTFAHL RICHARD K         384930
WHALEY DAVID A              98718
WHALLEY LAWRENCE G        6079137
WHITE JR THOMAS E        15144123
WINOKUR JR. HERBERT S           0
WODRASKA JOHN                   0
WROBEL BRUCE               139130
YEAGER F SCOTT           11884758
YEAP SOON                  192758
Name: total_stock_value, Length: 145, dtype: float64



In [ ]:



In [38]:

    
from sklearn.preprocessing import StandardScaler

df['total_stock_scaled'] = StandardScaler().fit_transform(df[['total_stock_value']])
df['bonus_scaled'] = StandardScaler().fit_transform(df[['bonus']])

print df.total_stock_scaled.describe() plt.hist(df.total_stock_scaled)



In [39]:

    
def dont_neg_log(x):
    if x >=0:
        return np.log1p(x)
    else:
        return 0
    
df['stock_log'] = df['total_stock_value'].apply(lambda x: dont_neg_log(x))

Feature Ratio Creation



In [561]:

    
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive'])

email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])



In [562]:

    
payment_comp = ['salary', 'deferral_payments','bonus', 'expenses', 'loan_advances',
                'other', 'director_fees', 'deferred_income', 'long_term_incentive']
payment_total = ['total_payments']

stock_comp = ['exercised_stock_options', 'restricted_stock','restricted_stock_deferred',]
stock_total = ['total_stock_value']

all_comp = payment_comp + stock_comp

email_comp = ['shared_receipt_with_poi', 'from_this_person_to_poi', 'from_poi_to_this_person' ]
email_totals = ['from_messages', 'to_messages'] # interaction_w_poi = total(from/to/shared poi)



In [ ]:



In [636]:

    
df['total_compensation'] = df['total_payments'] + df['total_stock_value']

for each in payment_comp:
    df['{0}_{1}_ratio'.format(each, 'total_pay')] = df[each]/df['total_payments']

for each in stock_comp:
    df['{0}_{1}_ratio'.format(each, 'total_stock')] = df[each]/df['total_stock_value']

for each in all_comp:
    df['{0}_{1}_ratio'.format(each, 'total_compensation')] = df[each]/df['total_compensation']

    
df['total_poi_interaction'] = df['shared_receipt_with_poi'] + df['from_this_person_to_poi'] + \
df['from_poi_to_this_person']

for each in email_comp:
    df['{0}_{1}_ratio'.format(each, 'total_poi_int')] = df[each]/df['total_poi_interaction']

df['total_active_poi_interaction'] = df['from_this_person_to_poi'] + df['from_poi_to_this_person']
df['to_poi_total_active_poi_ratio'] = df['from_this_person_to_poi']/df['total_active_poi_interaction']
df['from_poi_total_active_poi_ratio'] = df['from_poi_to_this_person']/df['total_active_poi_interaction']

df['to_messages_to_poi_ratio'] = df['from_this_person_to_poi']/ df['to_messages']
df['from_messages_from_poi_ratio'] = df['from_poi_to_this_person']/df['from_messages']
df['shared_poi_from_messages_ratio'] = df['shared_receipt_with_poi']/df['from_messages']
df['shared_poi_total_compensation'] = df['shared_receipt_with_poi']/df['total_compensation']
df['bonus_by_total_stock'] = df['bonus']/df['total_stock_value']

## Add squared features
for each in all_comp:
    df['{0}_squared'.format(each)] = df[each]**2
    
for each in email_comp:
    df['{0}_squared'.format(each)] = df[each]**2

A good portion of people were paid either only in stock or payments. Another good portion also didn't have email statistics available.

These ratios will need to be set to zero manually due to division by 0 - NaN.



In [640]:

    
df = df.apply(lambda x: x.fillna(0), axis=0)



In [644]:









    



<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 73 columns):
salary                                                145 non-null int64
to_messages                                           145 non-null int64
deferral_payments                                     145 non-null int64
total_payments                                        145 non-null int64
exercised_stock_options                               145 non-null int64
bonus                                                 145 non-null int64
restricted_stock                                      145 non-null int64
shared_receipt_with_poi                               145 non-null int64
restricted_stock_deferred                             145 non-null int64
total_stock_value                                     145 non-null int64
expenses                                              145 non-null int64
loan_advances                                         145 non-null int64
from_messages                                         145 non-null int64
other                                                 145 non-null int64
from_this_person_to_poi                               145 non-null int64
poi                                                   145 non-null bool
director_fees                                         145 non-null int64
deferred_income                                       145 non-null int64
long_term_incentive                                   145 non-null int64
from_poi_to_this_person                               145 non-null int64
total_compensation                                    145 non-null int64
salary_total_pay_ratio                                145 non-null float64
deferral_payments_total_pay_ratio                     145 non-null float64
bonus_total_pay_ratio                                 145 non-null float64
expenses_total_pay_ratio                              145 non-null float64
loan_advances_total_pay_ratio                         145 non-null float64
other_total_pay_ratio                                 145 non-null float64
director_fees_total_pay_ratio                         145 non-null float64
deferred_income_total_pay_ratio                       145 non-null float64
long_term_incentive_total_pay_ratio                   145 non-null float64
exercised_stock_options_total_stock_ratio             145 non-null float64
restricted_stock_total_stock_ratio                    145 non-null float64
restricted_stock_deferred_total_stock_ratio           145 non-null float64
salary_total_compensation_ratio                       145 non-null float64
deferral_payments_total_compensation_ratio            145 non-null float64
bonus_total_compensation_ratio                        145 non-null float64
expenses_total_compensation_ratio                     145 non-null float64
loan_advances_total_compensation_ratio                145 non-null float64
other_total_compensation_ratio                        145 non-null float64
director_fees_total_compensation_ratio                145 non-null float64
deferred_income_total_compensation_ratio              145 non-null float64
long_term_incentive_total_compensation_ratio          145 non-null float64
exercised_stock_options_total_compensation_ratio      145 non-null float64
restricted_stock_total_compensation_ratio             145 non-null float64
restricted_stock_deferred_total_compensation_ratio    145 non-null float64
total_poi_interaction                                 145 non-null int64
shared_receipt_with_poi_total_poi_int_ratio           145 non-null float64
from_this_person_to_poi_total_poi_int_ratio           145 non-null float64
from_poi_to_this_person_total_poi_int_ratio           145 non-null float64
total_active_poi_interaction                          145 non-null int64
to_poi_total_active_poi_ratio                         145 non-null float64
from_poi_total_active_poi_ratio                       145 non-null float64
to_messages_to_poi_ratio                              145 non-null float64
from_messages_from_poi_ratio                          145 non-null float64
shared_poi_from_messages_ratio                        145 non-null float64
shared_poi_total_compensation                         145 non-null float64
bonus_by_total_stock                                  145 non-null float64
ex_stock_squared                                      145 non-null int64
salary_squared                                        145 non-null int64
deferral_payments_squared                             145 non-null int64
bonus_squared                                         145 non-null int64
expenses_squared                                      145 non-null int64
loan_advances_squared                                 145 non-null int64
other_squared                                         145 non-null int64
director_fees_squared                                 145 non-null int64
deferred_income_squared                               145 non-null int64
long_term_incentive_squared                           145 non-null int64
exercised_stock_options_squared                       145 non-null int64
restricted_stock_squared                              145 non-null int64
restricted_stock_deferred_squared                     145 non-null int64
shared_receipt_with_poi_squared                       145 non-null int64
from_this_person_to_poi_squared                       145 non-null int64
from_poi_to_this_person_squared                       145 non-null int64
dtypes: bool(1), float64(34), int64(38)



In [565]:

    
df[['poi', 'director_fees_total_pay_ratio', 'director_fees', 'total_payments']]









    Out[565]:






  
    
      
      poi
      director_fees_total_pay_ratio
      director_fees
      total_payments
    
  
  
    
      ALLEN PHILLIP K
       False
        0.000000
            0
       4484442
    
    
      BADUM JAMES P
       False
        0.000000
            0
        182466
    
    
      BANNANTINE JAMES M
       False
        0.000000
            0
        916197
    
    
      BAXTER JOHN C
       False
        0.000000
            0
       5634343
    
    
      BAY FRANKLIN R
       False
        0.000000
            0
        827696
    
    
      BAZELIDES PHILIP J
       False
        0.000000
            0
        860136
    
    
      BECK SALLY W
       False
        0.000000
            0
        969068
    
    
      BELDEN TIMOTHY N
        True
        0.000000
            0
       5501630
    
    
      BELFER ROBERT
       False
       31.202435
       102500
          3285
    
    
      BERBERIAN DAVID
       False
        0.000000
            0
        228474
    
    
      BERGSIEKER RICHARD P
       False
        0.000000
            0
        618850
    
    
      BHATNAGAR SANJAY
       False
        0.000000
            0
        137864
    
    
      BIBI PHILIPPE A
       False
        0.000000
            0
       2047593
    
    
      BLACHMAN JEREMY M
       False
        0.000000
            0
       2014835
    
    
      BLAKE JR. NORMAN P
       False
       88.963253
       113784
          1279
    
    
      BOWEN JR RAYMOND M
        True
        0.000000
            0
       2669589
    
    
      BROWN MICHAEL
       False
        0.000000
            0
         49288
    
    
      BUCHANAN HAROLD G
       False
        0.000000
            0
       1054637
    
    
      BUTTS ROBERT H
       False
        0.000000
            0
       1271582
    
    
      BUY RICHARD B
       False
        0.000000
            0
       2355702
    
    
      CALGER CHRISTOPHER F
        True
        0.000000
            0
       1639297
    
    
      CARTER REBECCA C
       False
        0.000000
            0
        477557
    
    
      CAUSEY RICHARD A
        True
        0.000000
            0
       1868758
    
    
      CHAN RONNIE
       False
             inf
        98784
             0
    
    
      CHRISTODOULOU DIOMEDES
       False
             inf
            0
             0
    
    
      CLINE KENNETH W
       False
             inf
            0
             0
    
    
      COLWELL WESLEY
        True
        0.000000
            0
       1490344
    
    
      CORDES WILLIAM R
       False
             inf
            0
             0
    
    
      COX DAVID
       False
        0.000000
            0
       1101393
    
    
      CUMBERLAND MICHAEL S
       False
        0.000000
            0
        807956
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      SCRIMSHAW MATTHEW
       False
             inf
            0
             0
    
    
      SHANKMAN JEFFREY A
       False
        0.000000
            0
       3038702
    
    
      SHAPIRO RICHARD S
       False
        0.000000
            0
       1057548
    
    
      SHARP VICTORIA T
       False
        0.000000
            0
       1576511
    
    
      SHELBY REX
        True
        0.000000
            0
       2003885
    
    
      SHERRICK JEFFREY B
       False
             inf
            0
             0
    
    
      SHERRIFF JOHN R
       False
        0.000000
            0
       4335388
    
    
      SKILLING JEFFREY K
        True
        0.000000
            0
       8682716
    
    
      STABLER FRANK
       False
        0.000000
            0
       1112087
    
    
      SULLIVAN-SHAKLOVITZ COLLEEN
       False
        0.000000
            0
        999356
    
    
      SUNDE MARTIN
       False
        0.000000
            0
       1545059
    
    
      TAYLOR MITCHELL S
       False
        0.000000
            0
       1092663
    
    
      THE TRAVEL AGENCY IN THE PARK
       False
        0.000000
            0
        362096
    
    
      THORN TERENCE H
       False
        0.000000
            0
        911453
    
    
      TILNEY ELIZABETH A
       False
        0.000000
            0
        399393
    
    
      UMANOFF ADAM S
       False
        0.000000
            0
       1130461
    
    
      URQUHART JOHN A
       False
        0.160354
        36666
        228656
    
    
      WAKEHAM JOHN
       False
        0.512965
       109298
        213071
    
    
      WALLS JR ROBERT H
       False
        0.000000
            0
       1798780
    
    
      WALTERS GARETH W
       False
        0.000000
            0
         87410
    
    
      WASAFF GEORGE
       False
        0.000000
            0
       1034395
    
    
      WESTFAHL RICHARD K
       False
        0.000000
            0
        762135
    
    
      WHALEY DAVID A
       False
             inf
            0
             0
    
    
      WHALLEY LAWRENCE G
       False
        0.000000
            0
       4677574
    
    
      WHITE JR THOMAS E
       False
        0.000000
            0
       1934359
    
    
      WINOKUR JR. HERBERT S
       False
        1.277520
       108579
         84992
    
    
      WODRASKA JOHN
       False
        0.000000
            0
        189583
    
    
      WROBEL BRUCE
       False
             inf
            0
             0
    
    
      YEAGER F SCOTT
        True
        0.000000
            0
        360300
    
    
      YEAP SOON
       False
        0.000000
            0
         55097
    
  

145 rows × 4 columns



In [566]:

    
df[df['poi']==True]









    Out[566]:






  
    
      
      salary
      to_messages
      deferral_payments
      total_payments
      exercised_stock_options
      bonus
      restricted_stock
      shared_receipt_with_poi
      restricted_stock_deferred
      total_stock_value
      ...
      from_this_person_to_poi_total_poi_int_ratio
      from_poi_to_this_person_total_poi_int_ratio
      total_active_poi_interaction
      to_poi_total_active_poi_ratio
      from_poi_total_active_poi_ratio
      to_messages_to_poi_ratio
      from_messages_from_poi_ratio
      shared_poi_from_messages_ratio
      shared_poi_total_compensation
      bonus_by_total_stock
    
  
  
    
      BELDEN TIMOTHY N
        213999
       7991
       2144013
         5501630
         953136
       5249999
         157569
       5521
       0
        1110705
      ...
       0.018439
       0.038928
       336
       0.321429
       0.678571
       0.013515
       0.471074
       11.407025
       0.000835
       4.726727
    
    
      BOWEN JR RAYMOND M
        278601
       1858
             0
         2669589
              0
       1350000
         252055
       1593
       0
         252055
      ...
       0.008581
       0.080092
       155
       0.096774
       0.903226
       0.008073
       5.185185
       59.000000
       0.000545
       5.355974
    
    
      CALGER CHRISTOPHER F
        240189
       2598
             0
         1639297
              0
       1250000
         126027
       2188
       0
         126027
      ...
       0.010365
       0.082504
       224
       0.111607
       0.888393
       0.009623
       1.381944
       15.194444
       0.001239
       9.918510
    
    
      CAUSEY RICHARD A
        415189
       1892
             0
         1868758
              0
       1000000
        2502063
       1585
       0
        2502063
      ...
       0.007251
       0.035045
        70
       0.171429
       0.828571
       0.006342
       1.183673
       32.346939
       0.000363
       0.399670
    
    
      COLWELL WESLEY
        288542
       1758
         27610
         1490344
              0
       1200000
         698242
       1132
       0
         698242
      ...
       0.007954
       0.173536
       251
       0.043825
       0.956175
       0.006257
       6.000000
       28.300000
       0.000517
       1.718602
    
    
      DELAINEY DAVID W
        365163
       3093
             0
         4747979
        2291113
       3000000
        1323148
       2097
       0
        3614261
      ...
       0.219697
       0.023810
       675
       0.902222
       0.097778
       0.196896
       0.021505
        0.683284
       0.000251
       0.830045
    
    
      FASTOW ANDREW S
        440698
          0
             0
         2424083
              0
       1300000
        1794412
          0
       0
        1794412
      ...
            inf
            inf
         0
            inf
            inf
            inf
            inf
             inf
       0.000000
       0.724471
    
    
      GLISAN JR BEN F
        274975
        873
             0
         1272284
         384728
        600000
         393818
        874
       0
         778546
      ...
       0.006438
       0.055794
        58
       0.103448
       0.896552
       0.006873
       3.250000
       54.625000
       0.000426
       0.770667
    
    
      HANNON KEVIN P
        243293
       1045
             0
          288682
        5538001
       1500000
         853064
       1035
       0
        6391065
      ...
       0.019301
       0.029412
        53
       0.396226
       0.603774
       0.020096
       1.000000
       32.343750
       0.000155
       0.234703
    
    
      HIRKO JOSEPH
             0
          0
         10259
           91093
       30766064
             0
              0
          0
       0
       30766064
      ...
            inf
            inf
         0
            inf
            inf
            inf
            inf
             inf
       0.000000
       0.000000
    
    
      KOENIG MARK E
        309946
       2374
             0
         1587421
         671737
        700000
        1248318
       2271
       0
        1920055
      ...
       0.006413
       0.022659
        68
       0.220588
       0.779412
       0.006318
       0.868852
       37.229508
       0.000647
       0.364573
    
    
      KOPPER MICHAEL J
        224305
          0
             0
         2652612
              0
        800000
         985032
          0
       0
         985032
      ...
            inf
            inf
         0
            inf
            inf
            inf
            inf
             inf
       0.000000
       0.812156
    
    
      LAY KENNETH L
       1072321
       4273
        202911
       103559793
       34348384
       7000000
       14761694
       2411
       0
       49110078
      ...
       0.006275
       0.048235
       139
       0.115108
       0.884892
       0.003744
       3.416667
       66.972222
       0.000016
       0.142537
    
    
      RICE KENNETH D
        420636
        905
             0
          505050
       19794175
       1750000
        2748364
        864
       0
       22542539
      ...
       0.004396
       0.046154
        46
       0.086957
       0.913043
       0.004420
       2.333333
       48.000000
       0.000037
       0.077631
    
    
      RIEKER PAULA H
        249201
       1328
        214678
         1099100
        1635238
        700000
         283649
       1258
       0
        1918887
      ...
       0.035794
       0.026100
        83
       0.578313
       0.421687
       0.036145
       0.426829
       15.341463
       0.000417
       0.364795
    
    
      SHELBY REX
        211844
        225
             0
         2003885
        1624396
        200000
         869220
         91
       0
        2493616
      ...
       0.118644
       0.110169
        27
       0.518519
       0.481481
       0.062222
       0.333333
        2.333333
       0.000020
       0.080205
    
    
      SKILLING JEFFREY K
       1111258
       3627
             0
         8682716
       19250000
       5600000
        6843672
       2042
       0
       26093672
      ...
       0.013889
       0.040741
       118
       0.254237
       0.745763
       0.008271
       0.814815
       18.907407
       0.000059
       0.214611
    
    
      YEAGER F SCOTT
        158403
          0
             0
          360300
        8308552
             0
        3576206
          0
       0
       11884758
      ...
            inf
            inf
         0
            inf
            inf
            inf
            inf
             inf
       0.000000
       0.000000
    
  

18 rows × 57 columns

director_fees_total_pay_ratio, deferred_income_total_pay_ratio, exercised_stock_options_total_stock_ratio, exercised_stock_options_total_stock_ratio, restricted_stock_deferred_total_stock_ratio, restricted_stock_total_stock_ratio, director_fees_total_compensation_ratio, deferred_income_total_compensation_ratio, restricted_stock_total_compensation_ratio, restricted_stock_deferred_total_compensation_ratio

Replace Inf/-Inf created in pandas from dividing a -/+ number by zero



In [647]:

    
df = df.replace([np.inf, -np.inf], 0)



In [47]:

    
#df.ix[20:30, 30:40]



In [48]:

    
# Column/row slicing by number
# df.ix[11,:]



In [796]:

    
#all_cols2 = np.concatenate([all_cols, np.array(['shared_poi_bins', 'ex_stock_bins', 
#                                                'total_stock_scaled', 'bonus_scaled',
#                                                'stock_log'])])
# from_messages_from_poi_to_ratio

features = np.array(df.drop('poi', axis=1).columns)

clf = ExtraTreesClassifier(n_estimators=3000)
clf.fit(df[features], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
plt.figure(figsize=(16,14))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, features[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

top10_features_RF = ['bonus', 'total_stock_value', 'other', 'total_compensation', 'expenses',
                 'other_total_pay_ratio', 'from_messages_from_poi_ratio', 'restricted_stock',
                 'shared_poi_from_messages_ratio', 'total_payments']

top10_features_ET = ['exercised_stock_options_squared', 'total_stock_value', 'bonus_total_pay_ratio', 
                     'long_term_incentive_total_pay_ratio', 'bonus', 'deferred_income',
                     'total_compensation', 'to_messages_to_poi_ratio',
                     'from_messages_from_poi_ratio', 'to_messages_to_poi_ratio', 'other_total_pay_ratio',
                     'salary_squared', 'other']



In [ ]:



In [569]:

    
confusion_matrix(df['poi'], clf.predict(df[features]))









    Out[569]:





array([[127,   0],
       [  0,  18]])



In [206]:

    
#X_df = df.drop('poi', axis=1)
#y_df = df['poi']
#selector = SelectKBest(k=12, score_func=f_classif)
#selector = selector.fit_transform(X_df, y_df)
#selector









    Out[206]:





array([[201955.0, 4484442.0, 4175000.0, ..., 6213983.0, 0.9309965431596617,
        0.06796943744617502],
       [0.0, 182466.0, 0.0, ..., 440283.0, 0.0, 0.0],
       [477.0, 916197.0, 0.0, ..., 6159684.0, 0.0, 0.0],
       ..., 
       [0.0, 0.0, 0.0, ..., 139130.0, 0.0, 0.0],
       [158403.0, 360300.0, 0.0, ..., 12245058.0, 0.0, 0.0],
       [0.0, 55097.0, 0.0, ..., 247855.0, 0.0, 0.0]], dtype=object)

Train



In [ ]:

    
FINANCIAL_FIELDS = ['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive', 'ex_stock_bins', 'stock_log']

EMAIL_FIELDS = ['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address',
              'shared_poi_bins']



In [222]:

    
class ColumnExtractor(TransformerMixin):
    '''
    Columns extractor transformer for sklearn pipelines.
    Inherits fit_transform() from TransformerMixin, but this is explicitly
    defined here for clarity.
    
    Methods to extract pandas dataframe columns are defined for this class.
    
    '''
    def __init__(self, columns=[]):
        self.columns = columns
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    
    def transform(self, X, **transform_params):
        '''
        Input: A pandas dataframe and a list of column names to extract.
        Output: A pandas dataframe containing only the columns of the names passed in.
        '''
        return X[self.columns]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        """Get parameters for this estimator.
        Parameters
        ----------
        deep: boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.
        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        return self



In [545]:

    
top10_features_ET
top10 = ['exercised_stock_options', 'total_stock_value', 'bonus', 'salary', 'deferred_income', 
        'long_term_incentive', 'restricted_stock', 'total_payments', 'loan_advances',
         'shared_receipt_with_poi','total_compensation', 'from_messages_from_poi_ratio']



In [ ]:



In [936]:

    
#X_df = df[['total_payments', 'total_stock_value', 'shared_receipt_with_poi', 'bonus']].astype(float)
X_df = df.drop('poi', axis=1).astype(float)
#X_df = df[top10_features_ET]

#X_df = df[topKBest].astype(float)
y_df = df['poi']

y_df = y_df[X_df.abs().sum(axis=1) != 0]
X_df = X_df[X_df.abs().sum(axis=1) != 0]


from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report


sk_fold = StratifiedShuffleSplit(y_df, n_iter=100, test_size=0.1) 
        
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy="median", verbose=0)),
                           #('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
                           ('minmaxer', MinMaxScaler()),
                           #('low_var_remover', VarianceThreshold()),
                           ('selection', SelectKBest(score_func=f_classif)),
                           ('reducer', PCA()),
                           #('classifier', LinearSVC(penalty='l1', dual=False)),
                           #('KMeans', KMeans(n_clusters=2))
                           ('classifier', LogisticRegression())
                           #('classifier2', SGDClassifier(n_iter=300))
                                                     ]) # ,
                           #('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
                           #                            criterion='gini', n_estimators=1500, n_jobs=1,
                           #                            oob_score=True, random_state=None, verbose=0,
                           #                            max_features='auto', min_samples_split=2,
                           #                            min_samples_leaf=1))])

                    
params = {
          #'ET__n_estimators': [1500],
          #'ET__max_features': ['auto', None, 3, 5, 10, 20],
          #'ET__min_samples_split': [2, 4, 10],
          #'ET__min_samples_leaf': [1, 2, 5],
          'selection__k': [20, 17, 15],
          'classifier__C': [1, 10, 100, 1000],
          #'classifier2__alpha': [0.0001, 0.001],
          #'classifier2__loss': ['hinge', 'log', 'modified_huber'],
          #'classifier2__class_weight': [{True: 4, False: 1}, {True: 10, False: 1}],
          #'classifier__penalty': ['l1', 'l2'],
          'classifier__class_weight': [{True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}],
          'classifier__tol': [1e-1, 1e-2, 1e-4, 1e-8, 1e-16, 1e-32],
          'reducer__n_components': [1, 2, 3, 4, 5],
          'reducer__whiten': [True, False]
          #'feature_selection__k': [3, 5, 10, 20]
          #'ET__criterion' : ['gini', 'entropy'],
          #'imputer__strategy': ['median', 'mean'],
          #'low_var_remover__threshold': [0, 0.1, .25, .50, .75, .90, .99]
          }
# Scoring: average_precision, roc_auc, f1, recall, precision
grid_search = GridSearchCV(pipeline, param_grid=params, cv=sk_fold, n_jobs = 1, scoring='f1')
grid_search.fit(X_df, y=y_df)
#test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
    #f1_avg.append(f1_score(y_test, test_pred))
#print "F1: ", f1_score(y_test, test_pred)
#print "Confusion Matrix: "
#print confusion_matrix(y_test, test_pred)
#print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_









    



Best Estimator:  Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=20, score_func=<function f_classif at 0x0000000016AA6C88>)), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=1, class_weight={False: 1, True: 10}, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.1))])
Best Params:  {'reducer__n_components': 1, 'classifier__class_weight': {False: 1, True: 10}, 'classifier__tol': 0.1, 'selection__k': 20, 'reducer__whiten': True, 'classifier__C': 1}



In [937]:

    
n_iter = 1000
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
f1_avg = []
recall_avg = []
precision_avg = []
for i, all_index in enumerate(sk_fold):
    train_index = all_index[0]
    test_index = all_index[1]
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]

    grid_search.best_estimator_.fit(X_train, y=y_train)
    # pipeline.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #test_pred = pipeline.predict(X_test)

    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    #print "Best Estimator: ", grid_search.best_estimator_
    #print f1_score(y_test, test_pred)
    if i % round(n_iter/10) == 0:
        sys.stdout.write('{0}%..'.format(float(i)/n_iter*100)) 
        sys.stdout.flush()        
    f1_avg.append(f1_score(y_test, test_pred))
    precision_avg.append(precision_score(y_test, test_pred))
    recall_avg.append(recall_score(y_test, test_pred))

print "Done!"
print ""
print "F1 Avg: ", sum(f1_avg)/n_iter
print "Precision Avg: ", sum(precision_avg)/n_iter
print "Recall Avg: ", sum(recall_avg)/n_iter









    



0.0%..10.0%..20.0%..30.0%..40.0%..50.0%..60.0%..70.0%..80.0%..90.0%..Done!

F1 Avg:  0.415665673216
Precision Avg:  0.299662842713
Recall Avg:  0.755



In [ ]:

F1 Avg: 0.309882173382 Precision Avg: 0.226065462315

Recall Avg: 0.5515

Best Estimator: Pipeline(steps=[('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('low_var_remover', VarianceThreshold(threshold=0.1)), ('classifier', LinearSVC(C=0.1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-07, verbose=0))]) Best Params: {'classifierclass_weight': 'auto', 'low_var_removerthreshold': 0.1, 'classifierC': 0.1, 'classifiertol': 1e-07}

F1 Avg: 0.39108035853 Precision Avg: 0.263075613276

Recall Avg: 0.8335

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=True)), ('classifier', LogisticRegression(C=0.01, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.01))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': 'auto', 'classifierC': 0.01, 'reducern_components': 5, 'classifier__tol': 0.01}

F1 Avg: 0.408565806416 Precision Avg: 0.301739249639

Recall Avg: 0.725

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=False)), ('classifier2', SGDClassifier(alpha=0.0001, class_weight='auto', epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=300, n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'reducern_components': 5, 'classifier2alpha': 0.0001, 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'reducerwhiten': False, 'classifier2penalty': 'elasticnet'}

F1 Avg: 0.293634931735 Precision Avg: 0.219107395382

Recall Avg: 0.5055

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LinearSVC(C=1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-08, verbose=0)), ('classifier2', SGDClassifier(a..., penalty='l2', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'classifier2alpha': 0.001, 'classifierclass_weight': 'auto', 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'classifiertol': 1e-08, 'classifier2penalty': 'l2', 'classifier__C': 1}

F1 Avg: 0.392249062049 Precision Avg: 0.300678174603

Recall Avg: 0.636

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=4, whiten=True)), ('classifier', LogisticRegression(C=10, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))]) Best Params: {'reducern_components': 4, 'classifierclass_weight': 'auto', 'classifiertol': 0.0001, 'reducerwhiten': True, 'classifierC': 10, 'classifierpenalty': 'l2'}

F1 Avg: 0.461406277056 Precision Avg: 0.364574206349

Recall Avg: 0.7095

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=100, class_weight={False: 1, True: 8}, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.1))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': {False: 1, True: 8}, 'classifierC': 100, 'reducern_components': 1, 'classifier__tol': 0.1}



In [ ]:

    
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
                           #('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
                           #('low_var_remover', VarianceThreshold(threshold=0.1)), 
                           #('feature_selection', LinearSVC()),
                           ('features', FeatureUnion([
                                ('financial', Pipeline([
                                    ('extract', ColumnExtractor(FINANCIAL_FIELDS)),
                                    ('scale', StandardScaler()),
                                    ('reduce', LinearSVC())
                                ])),

                                ('email', Pipeline([
                                    ('extract2', ColumnExtractor(EMAIL_FIELDS)),
                                    ('scale2', StandardScaler()),
                                    ('reduce2', LinearSVC())
                                ]))

                            ])),
                           ('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
                                                       criterion='gini', n_estimators=1500, n_jobs=1,
                                                       oob_score=True, random_state=None, verbose=0,
                                                       max_features=None, min_samples_split=2,
                                                       min_samples_leaf=1))
                            ])



In [938]:

    
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"


def test_classifier(clf, dataset, feature_list, folds = 1000):
    #data = featureFormat(dataset, feature_list, sort_keys = True)
    #labels, features = targetFeatureSplit(data)
    labels = y_df
    features = X_df
    cv = StratifiedShuffleSplit(labels, n_iter=folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        #for ii in train_idx:
        #    features_train.append( features[ii] )
        #    labels_train.append( labels[ii] )
        #for jj in test_idx:
        #    features_test.append( features[jj] )
        #    labels_test.append( labels[jj] )
        features_train, features_test = features.irow(train_index), features.irow(test_index)
        labels_train, labels_test = labels[train_index], labels[test_index]
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives

        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        
        precision = 1.0*true_positives/(true_positives+false_positives)
        
        recall = 1.0*true_positives/(true_positives+false_negatives)
        
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
       
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)

        print clf
        print ""
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)

        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf



In [939]:

    
clf = Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), 
                      ('reducer', PCA(copy=True, n_components=4, whiten=True)), 
                      ('classifier', LogisticRegression(C=10, class_weight='auto',
                                                        dual=False, fit_intercept=True,
                                                        intercept_scaling=1, penalty='l2',
                                                        random_state=None, tol=0.0001))])



In [940]:

    
#test_classifier(clf, None, None, folds=1000)
test_classifier(grid_search.best_estimator_, None, None, folds=1000)









    



Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=20, score_func=<function f_classif at 0x0000000016AA6C88>)), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=1, class_weight={False: 1, True: 10}, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.1))])

	Accuracy: 0.86667	Precision: 0.50000	Recall: 1.00000	F1: 0.66667	F2: 0.83333
	Total predictions: 15000	True positives: 2000	False positives: 2000	False negatives:    0	True negatives: 11000



In [784]:

    
#test_classifier(clf, None, None, folds=1000)



In [ ]:



In [ ]:

    
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Load the dictionary containing the dataset
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()    # Provided to give you a starting point. Try a varity of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)



In [ ]:

	Features	Scores
0	exercised_stock_options_squared	25.04327
1	total_stock_value	22.78211
2	exercised_stock_options	22.61053
3	bonus	21.06
4	bonus_total_pay_ratio	20.98877
5	salary	18.5757
6	total_compensation	17.18271
7	long_term_incentive_total_pay_ratio	14.01403
8	salary_squared	13.75712
9	deferred_income	11.56189
10	bonus_squared	10.69425
11	long_term_incentive	10.07245
12	deferred_income_squared	10.01498
13	total_payments	9.380237
14	restricted_stock	8.964964
15	total_poi_interaction	8.773847
16	shared_receipt_with_poi	8.746486
17	loan_advances_squared	7.307514
18	loan_advances	7.24273
19	bonus_total_compensation_ratio	6.731432
20	shared_poi_from_messages_ratio	5.793909
21	expenses	5.550684
22	loan_advances_total_pay_ratio	5.396396
23	from_poi_to_this_person	5.344942
24	from_messages_from_poi_ratio	5.20965
25	shared_receipt_with_poi_squared	4.979852
26	total_active_poi_interaction	4.955198
27	bonus_by_total_stock	4.920968
28	other_squared	4.828893
29	restricted_stock_squared	4.794726
...	...	...
41	director_fees_squared	1.913096
42	deferral_payments_total_pay_ratio	1.779904
43	to_messages	1.698824
44	from_poi_to_this_person_total_poi_int_ratio	1.429218
45	from_this_person_to_poi_total_poi_int_ratio	1.245077
46	expenses_total_compensation_ratio	1.196422
47	restricted_stock_total_stock_ratio	1.128587
48	to_poi_total_active_poi_ratio	1.115302
49	deferral_payments_total_compensation_ratio	1.085361
50	expenses_squared	0.776179
51	restricted_stock_deferred	0.7434934
52	other_total_pay_ratio	0.7191198
53	from_poi_to_this_person_squared	0.6159672
54	restricted_stock_deferred_squared	0.311332
55	director_fees_total_pay_ratio	0.2215513
56	deferral_payments	0.2212145
57	director_fees_total_compensation_ratio	0.2199087
58	deferral_payments_squared	0.1872097
59	from_messages	0.1641645
60	restricted_stock_deferred_total_compensation_r...	0.142
61	expenses_total_pay_ratio	0.1220853
62	exercised_stock_options_total_stock_ratio	0.06276039
63	restricted_stock_total_compensation_ratio	0.04006004
64	exercised_stock_options_total_compensation_ratio	0.03827046
65	other_total_compensation_ratio	0.02193787
66	shared_poi_total_compensation	0.006134506
67	salary_total_compensation_ratio	0.002550889
68	deferred_income_total_pay_ratio	NaN
69	restricted_stock_deferred_total_stock_ratio	NaN
70	deferred_income_total_compensation_ratio	NaN

	poi	director_fees_total_pay_ratio	director_fees	total_payments
ALLEN PHILLIP K	False	0.000000	0	4484442
BADUM JAMES P	False	0.000000	0	182466
BANNANTINE JAMES M	False	0.000000	0	916197
BAXTER JOHN C	False	0.000000	0	5634343
BAY FRANKLIN R	False	0.000000	0	827696
BAZELIDES PHILIP J	False	0.000000	0	860136
BECK SALLY W	False	0.000000	0	969068
BELDEN TIMOTHY N	True	0.000000	0	5501630
BELFER ROBERT	False	31.202435	102500	3285
BERBERIAN DAVID	False	0.000000	0	228474
BERGSIEKER RICHARD P	False	0.000000	0	618850
BHATNAGAR SANJAY	False	0.000000	0	137864
BIBI PHILIPPE A	False	0.000000	0	2047593
BLACHMAN JEREMY M	False	0.000000	0	2014835
BLAKE JR. NORMAN P	False	88.963253	113784	1279
BOWEN JR RAYMOND M	True	0.000000	0	2669589
BROWN MICHAEL	False	0.000000	0	49288
BUCHANAN HAROLD G	False	0.000000	0	1054637
BUTTS ROBERT H	False	0.000000	0	1271582
BUY RICHARD B	False	0.000000	0	2355702
CALGER CHRISTOPHER F	True	0.000000	0	1639297
CARTER REBECCA C	False	0.000000	0	477557
CAUSEY RICHARD A	True	0.000000	0	1868758
CHAN RONNIE	False	inf	98784	0
CHRISTODOULOU DIOMEDES	False	inf	0	0
CLINE KENNETH W	False	inf	0	0
COLWELL WESLEY	True	0.000000	0	1490344
CORDES WILLIAM R	False	inf	0	0
COX DAVID	False	0.000000	0	1101393
CUMBERLAND MICHAEL S	False	0.000000	0	807956
...	...	...	...	...
SCRIMSHAW MATTHEW	False	inf	0	0
SHANKMAN JEFFREY A	False	0.000000	0	3038702
SHAPIRO RICHARD S	False	0.000000	0	1057548
SHARP VICTORIA T	False	0.000000	0	1576511
SHELBY REX	True	0.000000	0	2003885
SHERRICK JEFFREY B	False	inf	0	0
SHERRIFF JOHN R	False	0.000000	0	4335388
SKILLING JEFFREY K	True	0.000000	0	8682716
STABLER FRANK	False	0.000000	0	1112087
SULLIVAN-SHAKLOVITZ COLLEEN	False	0.000000	0	999356
SUNDE MARTIN	False	0.000000	0	1545059
TAYLOR MITCHELL S	False	0.000000	0	1092663
THE TRAVEL AGENCY IN THE PARK	False	0.000000	0	362096
THORN TERENCE H	False	0.000000	0	911453
TILNEY ELIZABETH A	False	0.000000	0	399393
UMANOFF ADAM S	False	0.000000	0	1130461
URQUHART JOHN A	False	0.160354	36666	228656
WAKEHAM JOHN	False	0.512965	109298	213071
WALLS JR ROBERT H	False	0.000000	0	1798780
WALTERS GARETH W	False	0.000000	0	87410
WASAFF GEORGE	False	0.000000	0	1034395
WESTFAHL RICHARD K	False	0.000000	0	762135
WHALEY DAVID A	False	inf	0	0
WHALLEY LAWRENCE G	False	0.000000	0	4677574
WHITE JR THOMAS E	False	0.000000	0	1934359
WINOKUR JR. HERBERT S	False	1.277520	108579	84992
WODRASKA JOHN	False	0.000000	0	189583
WROBEL BRUCE	False	inf	0	0
YEAGER F SCOTT	True	0.000000	0	360300
YEAP SOON	False	0.000000	0	55097

	salary	to_messages	deferral_payments	total_payments	exercised_stock_options	bonus	restricted_stock	shared_receipt_with_poi	total_stock_value	...	from_this_person_to_poi_total_poi_int_ratio	from_poi_to_this_person_total_poi_int_ratio	total_active_poi_interaction	to_poi_total_active_poi_ratio	from_poi_total_active_poi_ratio	to_messages_to_poi_ratio	from_messages_from_poi_ratio	shared_poi_from_messages_ratio	shared_poi_total_compensation	bonus_by_total_stock
BELDEN TIMOTHY N	213999	7991	2144013	5501630	953136	5249999	157569	5521	1110705	...	0.018439	0.038928	336	0.321429	0.678571	0.013515	0.471074	11.407025	0.000835	4.726727
BOWEN JR RAYMOND M	278601	1858	0	2669589	0	1350000	252055	1593	252055	...	0.008581	0.080092	155	0.096774	0.903226	0.008073	5.185185	59.000000	0.000545	5.355974
CALGER CHRISTOPHER F	240189	2598	0	1639297	0	1250000	126027	2188	126027	...	0.010365	0.082504	224	0.111607	0.888393	0.009623	1.381944	15.194444	0.001239	9.918510
CAUSEY RICHARD A	415189	1892	0	1868758	0	1000000	2502063	1585	2502063	...	0.007251	0.035045	70	0.171429	0.828571	0.006342	1.183673	32.346939	0.000363	0.399670
COLWELL WESLEY	288542	1758	27610	1490344	0	1200000	698242	1132	698242	...	0.007954	0.173536	251	0.043825	0.956175	0.006257	6.000000	28.300000	0.000517	1.718602
DELAINEY DAVID W	365163	3093	0	4747979	2291113	3000000	1323148	2097	3614261	...	0.219697	0.023810	675	0.902222	0.097778	0.196896	0.021505	0.683284	0.000251	0.830045
FASTOW ANDREW S	440698	0	0	2424083	0	1300000	1794412	0	1794412	...	inf	inf	0	inf	inf	inf	inf	inf	0.000000	0.724471
GLISAN JR BEN F	274975	873	0	1272284	384728	600000	393818	874	778546	...	0.006438	0.055794	58	0.103448	0.896552	0.006873	3.250000	54.625000	0.000426	0.770667
HANNON KEVIN P	243293	1045	0	288682	5538001	1500000	853064	1035	6391065	...	0.019301	0.029412	53	0.396226	0.603774	0.020096	1.000000	32.343750	0.000155	0.234703
HIRKO JOSEPH	0	0	10259	91093	30766064	0	0	0	30766064	...	inf	inf	0	inf	inf	inf	inf	inf	0.000000	0.000000
KOENIG MARK E	309946	2374	0	1587421	671737	700000	1248318	2271	1920055	...	0.006413	0.022659	68	0.220588	0.779412	0.006318	0.868852	37.229508	0.000647	0.364573
KOPPER MICHAEL J	224305	0	0	2652612	0	800000	985032	0	985032	...	inf	inf	0	inf	inf	inf	inf	inf	0.000000	0.812156
LAY KENNETH L	1072321	4273	202911	103559793	34348384	7000000	14761694	2411	49110078	...	0.006275	0.048235	139	0.115108	0.884892	0.003744	3.416667	66.972222	0.000016	0.142537
RICE KENNETH D	420636	905	0	505050	19794175	1750000	2748364	864	22542539	...	0.004396	0.046154	46	0.086957	0.913043	0.004420	2.333333	48.000000	0.000037	0.077631
RIEKER PAULA H	249201	1328	214678	1099100	1635238	700000	283649	1258	1918887	...	0.035794	0.026100	83	0.578313	0.421687	0.036145	0.426829	15.341463	0.000417	0.364795
SHELBY REX	211844	225	0	2003885	1624396	200000	869220	91	2493616	...	0.118644	0.110169	27	0.518519	0.481481	0.062222	0.333333	2.333333	0.000020	0.080205
SKILLING JEFFREY K	1111258	3627	0	8682716	19250000	5600000	6843672	2042	26093672	...	0.013889	0.040741	118	0.254237	0.745763	0.008271	0.814815	18.907407	0.000059	0.214611
YEAGER F SCOTT	158403	0	0	360300	8308552	0	3576206	0	11884758	...	inf	inf	0	inf	inf	inf	inf	inf	0.000000	0.000000